In [118]:
import lifelines
import pymc as pm
from pyBMA.CoxPHFitter import CoxPHFitter
import matplotlib.pyplot as plt
import numpy as np
from math import log
import pandas as pd
%matplotlib inline
The first step in any data analysis is acquiring and munging the data. An example data set can be found HERE.
Download the file output.txt and transform it into the format below, where the event column is 0 if there's only one entry for an id and 1 if there are two:
id,time_to_convert,age,male,search,brand,event
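One way in: the event flag falls straight out of the per-id row counts (a sketch, assuming the raw file has an id column, as the solution below does):
In [ ]:
# Hypothetical hint: ids with two rows converted (event = 1); single-row ids are censored
counts = pd.read_csv("E:/output.txt").groupby("id").size()
event = (counts == 2).astype(int)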
In [148]:
#Solution to part one:
# Converts a timedelta to days; ids with a single entry come through as a
# zero delta and get the sentinel value 23.0, the censoring point
def convert_to_days(dt):
    day_diff = dt / np.timedelta64(1, 'D')
    if day_diff == 0:
        return 23.0
    else:
        return day_diff
base_df = pd.read_csv("E:/output.txt")
base_df["time_to_convert"] = pd.to_datetime(base_df['datetime'])
base_df = base_df.drop('datetime', axis=1)
time_deltas = base_df.groupby(by = "id").max() - base_df.groupby(by = "id").min()
df = time_deltas["time_to_convert"].apply(convert_to_days).to_frame()
grouped_base = base_df.groupby(by = "id").max()
df["age"] = grouped_base["age"]
df["male"] = grouped_base["male"]
df["search"] = grouped_base["search"]
df["brand"] = grouped_base["brand"]
df["event"] = df["time_to_convert"] == 23.0
df["event"] = df["event"].apply(lambda x: 0 if x else 1)
df["time_to_convert"].median()
Out[148]:
In [ ]:
### Parametric Bayes
# Shout out to Cam Davidson-Pilon
In [136]:
## Example fully worked model using toy data
## Adapted from http://blog.yhat.com/posts/estimating-user-lifetimes-with-pymc.html
alpha = pm.Uniform("alpha", 0, 20)
beta = pm.Uniform("beta", 0, 20)
obs = pm.Weibull('obs', alpha, beta, value=df["time_to_convert"], observed=True)

# Penalise any state in which an observation falls beyond the 23-day censoring point
@pm.potential
def censorfactor(obs=obs):
    if np.any(obs > 23):
        return -100000
    else:
        return 0

mcmc = pm.MCMC([alpha, beta, obs, censorfactor])
mcmc.sample(5000, burn=0, thin=1)
In [137]:
pm.Matplot.plot(mcmc)
Problems:
1 - Work out the mean observed time to convert
2 - Try to fit your data from section 1
3 - Use the results to plot the distribution of the median
4 - Try adjusting the number of samples, the burn parameter and the amount of thinning to get good answers
5 - Try adjusting the prior and see how it affects the estimate
6 - Try to fit a different distribution to the data
7 - Compare answers
Bonus - test the hypothesis that the true median is greater than a certain amount
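Problem 1 has a one-liner answer, though be careful what "observed" means here: the censored rows sit at the 23.0 sentinel, so one reading restricts to ids that actually converted (a sketch):
In [ ]:
# Mean time to convert among ids that converted (event == 1)
df[df["event"] == 1]["time_to_convert"].mean()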
For question 2, note that the median of a Weibull (in PyMC's alpha, beta parameterisation) is:
$$\beta \, (\ln 2)^{1/\alpha}$$
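A quick way to trust that formula: simulate. numpy's Weibull draws have scale 1, so multiply by beta (a sketch with arbitrary illustrative parameter values):
In [ ]:
# Empirical median of simulated draws vs the closed form
a_chk, b_chk = 3.0, 14.0  # arbitrary values for illustration
draws = b_chk * np.random.weibull(a_chk, 100000)
print(np.median(draws), b_chk * (log(2)) ** (1 / a_chk))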
In [141]:
#Solution to question 3:
def weibull_median(alpha, beta):
    return beta * ((log(2)) ** (1 / alpha))

plt.hist([weibull_median(a, b) for a, b in zip(mcmc.trace("alpha")[:], mcmc.trace("beta")[:])])
Out[141]:
In [139]:
#Solution to question 4:
### Increasing the burn parameter allows us to discard results before convergence
### Thinning the results removes autocorrelation
mcmc = pm.MCMC([alpha, beta, obs, censorfactor ] )
mcmc.sample(50000, burn = 30000, thin = 20)
pm.Matplot.plot(mcmc)
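If you want to see what thinning is buying you, the lag-k autocorrelation of a trace is easy to estimate directly (a sketch, run after sampling the mcmc object above):
In [ ]:
# Rough lag-k autocorrelation of the alpha trace
trace = mcmc.trace("alpha")[:]
for k in [1, 5, 20]:
    print(k, np.corrcoef(trace[:-k], trace[k:])[0, 1])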
In [126]:
#Solution to Q5
## Adjusting the priors impacts the overall result
## If we give a looser, less informative prior then we end up with a broader, shorter distribution
## If we give much more informative priors, then we get a tighter, taller distribution
## Note the narrowing of the prior
#alpha = pm.Uniform("alpha", 2.5, 4.5)
#beta = pm.Uniform("beta", 14, 15)
####Uncomment this to see the result of looser priors
alpha = pm.Uniform("alpha", 0, 30)
beta = pm.Uniform("beta", 0, 30)
obs = pm.Weibull('obs', alpha, beta, value=df["time_to_convert"], observed=True)
mcmc = pm.MCMC([alpha, beta, obs, censorfactor ] )
mcmc.sample(10000, burn = 100, thin = 20)
pm.Matplot.plot(mcmc)
#plt.hist([weibull_median(x[0], x[1]) for x in zip(mcmc.trace("alpha"), mcmc.trace("beta"))])
In [127]:
#Solution to Q6
## To fit a different distribution to the data, we swap the obs variable for a new likelihood.
## This requires creating new hyperparameters.
## The normal distribution has two parameters: the mean, and the precision tau = 1/variance (PyMC's parameterisation)
# This begins the model
mean = pm.Uniform("mean", 0, 15)
tau = pm.Uniform("tau", 0, 3)
obs = pm.Normal('obs', mean, tau, value=df["time_to_convert"], observed=True)
mcmc = pm.MCMC([mean, tau, obs, censorfactor])
mcmc.sample(50000, burn = 30000, thin = 20)
#pm.Matplot.plot(mcmc)
plt.hist(mcmc.trace("mean")[:])
Out[127]:
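Since PyMC's Normal takes the precision rather than the standard deviation, it's worth keeping the conversion straight:
In [ ]:
# tau = 1 / variance, so a stdev of 2 days corresponds to tau = 0.25
sd = 2.0
print(1 / sd ** 2)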
In [128]:
## Solution to bonus
## Super easy in the Bayesian framework: all we need to do is look at what % of samples
## meet our criterion
testing_value = 10
number_of_greater_samples = sum([x >= testing_value for x in mcmc.trace("mean")[:]])
100 * (number_of_greater_samples / len(mcmc.trace("mean")[:]))
Out[128]:
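The same trace gives a credible interval just as directly (a sketch using numpy's percentile):
In [ ]:
# 95% credible interval for the mean, straight from the posterior samples
print(np.percentile(mcmc.trace("mean")[:], [2.5, 97.5]))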
In [ ]:
#Cox model
If we want to look at covariates, we need a new approach. We'll use Cox proportional hazards. More information here.
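The model factorises the hazard into a shared baseline and a multiplicative covariate effect:
$$h(t \mid x) = h_0(t)\, e^{\beta^\top x}$$
so each coefficient acts on the log-hazard scale and never changes the shape of the baseline over time.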
In [149]:
#Fitting solution
cf = lifelines.CoxPHFitter()
cf.fit(df, 'time_to_convert', event_col = 'event')
cf.summary
Out[149]:
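Each coefficient is easiest to read after exponentiation: exp(coef) is the hazard ratio for a one-unit increase in that covariate with the others held fixed.
In [ ]:
# Hazard ratio per unit increase of each covariate
np.exp(cf.summary["coef"])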
Once we've fit the data, we need to do something useful with it. Try to do the following things:
1 - Plot the baseline survival function
2 - Predict the functions for a particular set of regressors
3 - Plot the survival function for two different set of regressors
4 - For your results in part 3, calculate how much more likely a death event is for one than for the other over a given period of time
In [150]:
#Solution to 1
fig, axis = plt.subplots(nrows=1, ncols=1, sharex=True, sharey = True)
cf.baseline_survival_.plot(ax = axis, title = "Baseline Survival")
Out[150]:
In [151]:
# Solution to prediction
regressors = np.array([[45,0,0,0]])
survival = cf.predict_survival_function(regressors)
survival
Out[151]:
In [156]:
#Solution to plotting multiple regressors
fig, axis = plt.subplots(nrows=1, ncols=1, sharex=True)
regressor1 = np.array([[18, 0, 0, 1]])
regressor2 = np.array([[56, 0, 0, 1]])
survival_1 = cf.predict_survival_function(regressor1)
survival_2 = cf.predict_survival_function(regressor2)
plt.plot(survival_1, label="18 year old")
plt.plot(survival_2, label="56 year old")
plt.legend(loc="lower left")
Out[156]:
In [158]:
#Difference in survival
# Ratio of the two survival functions over time
ratio = survival_2 / survival_1
plt.plot(ratio, c="red")
Out[158]:
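Because the hazards are proportional, how much more likely a death event is for one profile than the other is constant in time and comes straight from the coefficients; for the two profiles above only age differs (a sketch, assuming the covariate is indexed as "age" in the summary):
In [ ]:
# Implied hazard ratio between the 56 and 18 year old profiles
np.exp(cf.summary["coef"]["age"] * (56 - 18))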
Model selection
Difficult to do with classic tools (here)
Problems:
1 - Calculate the BMA coefficient values
2 - Compare these results to the lifelines results
3 - Try running with different priors
In [ ]:
##Solution to 1
bmaCox = CoxPHFitter()
# One prior inclusion probability per covariate (age, male, search, brand)
bmaCox.fit(df, "time_to_convert", event_col="event", priors=[0.5] * 4)
In [ ]:
# Assuming pyBMA exposes a lifelines-style summary attribute after fitting
coefficient = bmaCox.summary["coef"]
print(coefficient)
In [ ]:
difference = coefficient - cf.summary["coef"]
print(difference)
print("Most are very close; some differ noticeably (no leaps-and-bounds search)")